library(reshape2)
Attaching package: ‘reshape2’
The following object is masked from ‘package:tidyr’:
smiths
The following objects are masked from ‘package:reshape’:
colsplit, melt, recast
library(tidyverse)
library(dagitty)
library(ggdag)
Attaching package: ‘ggdag’
The following object is masked from ‘package:gsignal’:
filter
The following object is masked from ‘package:stats’:
filter
library(survival)
library(survminer)
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Attaching package: ‘survminer’
The following object is masked from ‘package:survival’:
myeloma
library(GGally)
Registered S3 method overwritten by 'GGally':
method from
+.gg ggplot2
Attaching package: ‘GGally’
The following objects are masked from ‘package:miceadds’:
mean0, min0
library(ggpubr)
library(lavaan)
This is lavaan 0.6-16
lavaan is FREE software! Please report any bugs.
Attaching package: ‘lavaan’
The following object is masked from ‘package:psych’:
cor2cov
dat <- read.csv("S1Data.csv")
dat
col_names <- names(dat)
for (i in 1:ncol(dat)) {
hist(dat[,i], main = col_names[i], xlab = col_names[i])
}
Can’t stand the “Pletelets” misspelling. Also convert all binary variables to categorical or boolean.
col_names[col_names == "Pletelets"] <- "Platelets"
names(dat) <- col_names
dat <- dat %>%
mutate(
Gender = factor(Gender, labels = c("Female", "Male")),
across(c(Event, Smoking:Anaemia), as.logical)
)
dat
ggpairs(dat, aes(alpha = 0.01), progress = FALSE)
From the plot above, there seems to be a reasonably strong correlation between smoking and gender.
ggplot(dat, aes(Gender, Smoking)) +
geom_jitter()
chisq.test(dat$Gender, dat$Smoking)
Pearson's Chi-squared test with Yates' continuity correction
data: dat$Gender and dat$Smoking
X-squared = 57.463, df = 1, p-value = 3.444e-14
Apparently barely any women in our data set were smoking.
fit <- survfit(Surv(TIME, Event) ~ Gender, data = dat)
fit
Call: survfit(formula = Surv(TIME, Event) ~ Gender, data = dat)
n events median 0.95LCL 0.95UCL
Gender=Female 105 34 NA 207 NA
Gender=Male 194 62 NA 241 NA
ggsurvplot(fit,
pval = TRUE, conf.int = TRUE,
risk.table = TRUE, # Add risk table
risk.table.col = "strata", # Change risk table color by groups
linetype = "strata", # Change line type by groups
surv.median.line = "hv", # Specify median survival
ggtheme = theme_bw(), # Change ggplot2 theme
palette = c("#E7B800", "#2E9FDF"),
ncensor.plot = TRUE)
Warning: Median survival not reached.
cut_offs <- 0:200
names(cut_offs) <- paste0("cut_off", cut_offs)
dat_status <- dat %>%
add_column(!!!cut_offs) %>%
pivot_longer(cols = starts_with("cut_off"),
names_to = "cut_off",
values_to = "cut_off_val") %>%
group_by(cut_off_val) %>%
mutate(
censored = TIME < cut_off_val & Event == 0,
deceased = TIME < cut_off_val & Event == 1,
confirmed_alive = TIME >= cut_off_val
) %>%
summarize(
n_censored = sum(censored),
n_deceased = sum(deceased),
n_confirmed_alive = sum(confirmed_alive)
)
dat_status
df <- pivot_longer(dat_status, n_censored:n_confirmed_alive, names_to = "status")
ggplot(df, aes(cut_off_val, value)) +
geom_line(aes(color = status), linewidth = 1) +
labs(x = "Time (day)", y = "Number of patients") +
scale_color_hue(name = "Patient status", labels = c("Censored", "Alive", "Deceased")) +
theme_minimal()
ggsave(file.path("images", "cut_off.pdf"))
Saving 7.29 x 4.51 in image
There are periods of a sharp increase in censored patients, while the number of deceased patients only climbs gradually. The first jump in censored patients is just after t = 75, so around that time might be a good cut-off point. From the dat_status table, it is clear that we should then pick t = 74, as the only difference between t = 74 and t = 75 is an increase in censored patients.
dat_cut <- dat %>%
filter(!(TIME < 74 & Event == 0)) %>%
rename(Deceased = Event) %>%
select(!TIME)
dat_cut
saveRDS(dat_cut, "dat_cut.rds")
For convenience:
dat_cut <- readRDS("dat_cut.rds")
bn <- dagitty("dag{
Gender -> {Smoking Creatinine Ejection.Fraction}
Age -> {Creatinine Deceased Ejection.Fraction BP}
BP -> {Creatinine Deceased Ejection.Fraction}
Sodium -> Ejection.Fraction
Ejection.Fraction -> Deceased
Creatinine -> Deceased
Smoking -> {Creatinine Deceased Ejection.Fraction BP}
}")
# Just for plotting
bn_plt <- tidy_dagitty(bn, layout = "circle")
node_labels <- c(
"Age" = "Age",
"BP" = "BP",
"Creatinine" = "SC",
"Deceased" = "D",
"Ejection.Fraction" = "EF",
"Gender" = "Sex",
"Smoking" = "Sm",
"Sodium" = "SS"
)
bn_plt <- bn_plt %>%
mutate(
name = node_labels[name],
to = node_labels[to]
)
ggdag(bn_plt) +
theme_dag_blank()
ggsave(file.path('images', 'network_literature.pdf'))
Saving 7.29 x 4.51 in image
impliedConditionalIndependencies(bn)
Age _||_ Gndr
Age _||_ Smkn
Age _||_ Sodm
BP _||_ Gndr | Smkn
BP _||_ Sodm
Crtn _||_ Ej.F | Age, BP, Gndr, Smkn
Crtn _||_ Sodm
Dcsd _||_ Gndr | Age, BP, Crtn, Ej.F, Smkn
Dcsd _||_ Sodm | Age, BP, Ej.F, Gndr, Smkn
Dcsd _||_ Sodm | Age, BP, Crtn, Ej.F, Smkn
Gndr _||_ Sodm
Smkn _||_ Sodm
Convert all boolean/categorical columns to numeric, then run
localTests with type = "cis". This will
regress all variables on their parents, and use the correlation between
the resulting residuals to assess the amount of remaining dependence
between the regressed variables.
dat_cut_num <- dat_cut %>%
mutate(across(where(~ !is.numeric(.x)), as.numeric))
localTests(bn, dat_cut_num, type = "cis")
estimate p.value 2.5% 97.5%
Age _||_ Gndr 0.057165080 0.33569053 -0.05921100 0.17201822
Age _||_ Smkn 0.020080084 0.73548021 -0.09612779 0.13575048
Age _||_ Sodm -0.038445564 0.51758443 -0.15374924 0.07788537
BP _||_ Gndr | Smkn -0.085553405 0.14980894 -0.19977636 0.03094122
BP _||_ Sodm 0.045674193 0.44195634 -0.07068382 0.16081302
Crtn _||_ Ej.F | Age, BP, Gndr, Smkn -0.014453419 0.80921682 -0.13103918 0.10252481
Crtn _||_ Sodm -0.189766158 0.00123129 -0.29933005 -0.07544335
Dcsd _||_ Gndr | Age, BP, Crtn, Ej.F, Smkn -0.061971266 0.30085889 -0.17770702 0.05544317
Dcsd _||_ Sodm | Age, BP, Ej.F, Gndr, Smkn -0.142541050 0.01671445 -0.25536930 -0.02596171
Dcsd _||_ Sodm | Age, BP, Crtn, Ej.F, Smkn -0.096239108 0.10748339 -0.21090498 0.02100985
Gndr _||_ Sodm -0.028803474 0.62790087 -0.14430904 0.08747254
Smkn _||_ Sodm -0.008164811 0.89074907 -0.12403277 0.10792182
Creatinine and sodium have a relatively high correlation estimate and low p value, so they might not be independent after all. It is not the only violated independence assumption, but let’s start by adding an edge from creatinine to sodium.
bn2 <- dagitty("dag{
Gender -> {Smoking Creatinine Ejection.Fraction}
Age -> {Creatinine Deceased Ejection.Fraction BP}
BP -> {Creatinine Deceased Ejection.Fraction}
Sodium -> Ejection.Fraction
Ejection.Fraction -> Deceased
Creatinine -> {Deceased Sodium}
Smoking -> {Creatinine Deceased Ejection.Fraction BP}
}")
ggdag(bn2, layout = "circle")
And run our tests again:
localTests(bn2, dat_cut_num, type = "cis")
estimate p.value 2.5% 97.5%
Age _||_ Gndr 0.057165080 0.3356905 -0.05921100 0.17201822
Age _||_ Smkn 0.020080084 0.7354802 -0.09612779 0.13575048
Age _||_ Sodm | Crtn -0.009785316 0.8694724 -0.12583143 0.10652377
BP _||_ Gndr | Smkn -0.085553405 0.1498089 -0.19977636 0.03094122
BP _||_ Sodm | Crtn 0.045796085 0.4415459 -0.07076764 0.16113311
Crtn _||_ Ej.F | Age, BP, Gndr, Smkn, Sodm 0.021607536 0.7186036 -0.09564719 0.13827369
Dcsd _||_ Gndr | Age, BP, Crtn, Ej.F, Smkn -0.061971266 0.3008589 -0.17770702 0.05544317
Dcsd _||_ Sodm | Age, BP, Crtn, Ej.F, Smkn -0.096239108 0.1074834 -0.21090498 0.02100985
Gndr _||_ Sodm | Crtn -0.028234907 0.6353068 -0.14395389 0.08824199
Smkn _||_ Sodm | Crtn -0.013317464 0.8230285 -0.12930687 0.10302979
This resolved all other independence assumption violations.
Estimates are exactly the same as in the vanilla correlation case, p values are slightly different.
localTests(bn, dat_cut, type = "cis.pillai")
estimate p.value 2.5% 97.5%
Age _||_ Gndr 0.057165080 0.335395305 -0.05921095 0.17200808
Age _||_ Smkn 0.020080084 0.735263761 -0.09612724 0.13574739
Age _||_ Sodm -0.038445564 0.517265994 -0.15374347 0.07788518
BP _||_ Gndr | Smkn -0.085553405 0.148978170 -0.19955670 0.03073502
BP _||_ Sodm 0.045674193 0.441632403 -0.07068370 0.16080578
Crtn _||_ Ej.F | Age, BP, Gndr, Smkn -0.014453419 0.807717433 -0.13021858 0.10170052
Crtn _||_ Sodm -0.189766158 0.001261789 -0.29916507 -0.07544319
Dcsd _||_ Gndr | Age, BP, Crtn, Ej.F, Smkn -0.061971266 0.296280193 -0.17668477 0.05440322
Dcsd _||_ Sodm | Age, BP, Ej.F, Gndr, Smkn -0.142541050 0.015849561 -0.25431998 -0.02700404
Dcsd _||_ Sodm | Age, BP, Crtn, Ej.F, Smkn -0.096239108 0.104334124 -0.20987985 0.01996722
Gndr _||_ Sodm -0.028803474 0.627620782 -0.14430484 0.08747220
Smkn _||_ Sodm -0.008164811 0.890653723 -0.12403081 0.10792084
bn2 <- dagitty("dag{
Gender -> {Smoking Creatinine Ejection.Fraction}
Age -> {Creatinine Deceased Ejection.Fraction BP}
BP -> {Creatinine Deceased Ejection.Fraction}
Sodium -> Ejection.Fraction
Ejection.Fraction -> Deceased
Creatinine -> {Deceased Sodium}
Smoking -> {Creatinine Deceased Ejection.Fraction BP}
}")
ggdag(bn2, layout = "circle")
localTests(bn2, dat_cut, type = "cis.pillai")
estimate p.value 2.5% 97.5%
Age _||_ Gndr 0.057165080 0.3353953 -0.05921095 0.17200808
Age _||_ Smkn 0.020080084 0.7352638 -0.09612724 0.13574739
Age _||_ Sodm | Crtn -0.009785316 0.8691298 -0.12562619 0.10631880
BP _||_ Gndr | Smkn -0.085553405 0.1489782 -0.19955670 0.03073502
BP _||_ Sodm | Crtn 0.045796085 0.4404135 -0.07056216 0.16092477
Crtn _||_ Ej.F | Age, BP, Gndr, Smkn, Sodm 0.021607536 0.7159635 -0.09461303 0.13724703
Dcsd _||_ Gndr | Age, BP, Crtn, Ej.F, Smkn -0.061971266 0.2962802 -0.17668477 0.05440322
Dcsd _||_ Sodm | Age, BP, Crtn, Ej.F, Smkn -0.096239108 0.1043341 -0.20987985 0.01996722
Gndr _||_ Sodm | Crtn -0.028234907 0.6344312 -0.14374762 0.08803685
Smkn _||_ Sodm | Crtn -0.013317464 0.8225687 -0.12910150 0.10282481
r <- c()
for (n in names(bn2)) {
for (p in parents(bn2, n)) {
otherparents <- setdiff(parents(bn2, n), p)
tst <- ciTest(X=n, Y=p, Z=otherparents, dat_cut, type="cis.pillai" )
r <- rbind(r, data.frame(
name=p,
direction="->",
to=n,
cor=tst[,"estimate"],
p=tst[,"p.value"]
))
}
}
r
bn2_tidy <- tidy_dagitty(bn2, layout = "circle")
bn2_tidy$data <- bn2_tidy$data %>%
full_join(r, by = c("name", "direction", "to")) %>%
mutate(
x_text = (x + xend) / 2,
y_text = (y + yend) / 2,
cor = round(cor, 2),
# Some manual adjustments to certain labels clearer
x_text = case_when(
name == "Age" & to == "Ejection.Fraction" ~ 0.1,
name == "BP" & to == "Deceased" ~ -0.354,
TRUE ~ x_text
),
y_text = case_when(
name == "Smoking" & to == "Creatinine" ~ -0.2,
TRUE ~ y_text
)
)
bn2_tidy
# A DAG with 8 nodes and 18 edges
#
ggdag(bn2_tidy) +
geom_label(aes(x_text, y_text, label = cor),
data = filter(bn2_tidy, abs(cor) > 0.01))
Here we need to treat binary variables as numeric. Perhaps it makes more sense to estimate the GGM using linear regressions rather than polychoric correlations?
dat_cut_num <- dat_cut %>%
mutate(
across(c(Deceased:Anaemia), as.numeric),
across(everything(), scale) # Otherwise lavaan complains
)
cov_mat <- lavCor(dat_cut_num)
cov_mat
Decesd Gender Smokng Diabts BP Anaemi Age Ejct.F Sodium Cretnn Pltlts CPK
Deceased 1.000
Gender -0.002 1.000
Smoking -0.003 0.438 1.000
Diabetes 0.001 -0.178 -0.144 1.000
BP 0.074 -0.091 -0.033 -0.007 1.000
Anaemia 0.065 -0.077 -0.107 0.006 0.026 1.000
Age 0.250 0.057 0.020 -0.113 0.084 0.085 1.000
Ejection.Fraction -0.272 -0.159 -0.067 -0.010 0.042 0.064 0.077 1.000
Sodium -0.192 -0.029 -0.008 -0.085 0.046 0.041 -0.038 0.189 1.000
Creatinine 0.295 0.006 -0.026 -0.048 -0.004 0.049 0.153 -0.002 -0.190 1.000
Platelets -0.049 -0.116 0.050 0.107 0.044 -0.052 -0.036 0.079 0.063 -0.040 1.000
CPK 0.059 0.077 0.011 -0.019 -0.067 -0.190 -0.087 -0.040 0.062 -0.025 0.026 1.000
localTests(bn2, sample.cov = cov_mat, sample.nobs = nrow(dat_cut_num))
cov_df <- melt(cov_mat) %>%
rename(
name = Var1,
to = Var2,
poly_cor = value
)
bn2_tidy_poly <- bn2_tidy
bn2_tidy_poly$data <- bn2_tidy_poly$data %>%
left_join(cov_df, by = c("name", "to")) %>%
mutate(poly_cor = round(poly_cor, 2))
bn2_tidy_poly
# A DAG with 8 nodes and 18 edges
#
ggdag(bn2_tidy_poly) +
geom_label(aes(x_text, y_text, label = poly_cor),
data = filter(bn2_tidy_poly, abs(poly_cor) > 0.01))
ggm <- "
Creatinine ~ Gender + Smoking + Age + BP
BP ~ Smoking + Age
Sodium ~ Creatinine
Smoking ~ Gender
Ejection.Fraction ~ BP + Age + Sodium + Smoking + Gender
Deceased ~ Creatinine + BP + Age + Smoking + Ejection.Fraction
"
ggm_sem <- sem(ggm, data = dat_cut_num)
ggm_sem
lavaan 0.6.16 ended normally after 1 iteration
Estimator ML
Optimization method NLMINB
Number of model parameters 24
Number of observations 286
Model Test User Model:
Test statistic 7.053
Degrees of freedom 9
P-value (Chi-square) 0.632
summ <- summary(ggm_sem)
summ
lavaan 0.6.16 ended normally after 1 iteration
Estimator ML
Optimization method NLMINB
Number of model parameters 24
Number of observations 286
Model Test User Model:
Test statistic 7.053
Degrees of freedom 9
P-value (Chi-square) 0.632
Parameter Estimates:
Standard errors Standard
Information Expected
Information saturated (h1) model Structured
Regressions:
Estimate Std.Err z-value P(>|z|)
Creatinine ~
Gender 0.010 0.065 0.157 0.875
Smoking -0.034 0.065 -0.523 0.601
Age 0.154 0.059 2.624 0.009
BP -0.017 0.059 -0.288 0.774
BP ~
Smoking -0.035 0.059 -0.589 0.556
Age 0.085 0.059 1.437 0.151
Sodium ~
Creatinine -0.190 0.058 -3.269 0.001
Smoking ~
Gender 0.438 0.053 8.232 0.000
Ejection.Fraction ~
BP 0.011 0.057 0.190 0.850
Age 0.093 0.057 1.617 0.106
Sodium 0.187 0.057 3.279 0.001
Smoking 0.003 0.064 0.046 0.963
Gender -0.159 0.064 -2.507 0.012
Deceased ~
Creatinine 0.259 0.053 4.897 0.000
BP 0.067 0.053 1.276 0.202
Age 0.228 0.053 4.284 0.000
Smoking -0.019 0.052 -0.355 0.723
Ejection.Frctn -0.293 0.053 -5.570 0.000
Variances:
Estimate Std.Err z-value P(>|z|)
.Creatinine 0.972 0.081 11.958 0.000
.BP 0.988 0.083 11.958 0.000
.Sodium 0.961 0.080 11.958 0.000
.Smoking 0.806 0.067 11.958 0.000
.Ejection.Frctn 0.929 0.078 11.958 0.000
.Deceased 0.779 0.065 11.958 0.000
ggm_est <- summ$pe %>%
filter(op == "~") %>%
select(lhs, rhs, est, pvalue) %>%
rename(sem_est = est, sem_pvalue = pvalue) %>%
mutate(sem_est = round(sem_est, 2))
ggm_est
bn2_tidy_sem <- bn2_tidy
bn2_tidy_sem$data <- bn2_tidy_sem$data %>%
left_join(ggm_est, by = c("name" = "rhs", "to" = "lhs"))
bn2_tidy_sem
# A DAG with 8 nodes and 18 edges
#
ggdag(bn2_tidy_sem) +
geom_label(aes(x_text, y_text, label = sem_est),
data = filter(bn2_tidy_sem, abs(sem_est) > 0.01))
Remove edges with p-values above 0.05.
bn3_tidy_sem <- bn2_tidy_sem
bn3_tidy_sem$data <- bn3_tidy_sem$data %>%
filter(sem_pvalue < 0.05) %>%
# Add deleted nodes back in
add_row(
name = c("Deceased", "Smoking"),
x = c(-7.071068e-01, 0),
y = c(7.071068e-01, -1)
)
ggdag(bn3_tidy_sem) +
geom_label(aes(x_text, y_text, label = sem_est))
Rerun SEM with trimmed graphical model.
ggm2 <- "
Creatinine ~ Age
Sodium ~ Creatinine
Smoking ~ Gender
Ejection.Fraction ~ Sodium + Gender
Deceased ~ Creatinine + Age + Ejection.Fraction
Smoking ~~ 0*Deceased
Age ~~ 0*Gender
"
ggm_sem2 <- sem(ggm2, data = dat_cut_num)
ggm_sem2
lavaan 0.6.16 ended normally after 8 iterations
Estimator ML
Optimization method NLMINB
Number of model parameters 15
Number of observations 286
Model Test User Model:
Test statistic 8.235
Degrees of freedom 13
P-value (Chi-square) 0.828
summary(ggm_sem2)
lavaan 0.6.16 ended normally after 8 iterations
Estimator ML
Optimization method NLMINB
Number of model parameters 15
Number of observations 286
Model Test User Model:
Test statistic 8.235
Degrees of freedom 13
P-value (Chi-square) 0.828
Parameter Estimates:
Standard errors Standard
Information Expected
Information saturated (h1) model Structured
Regressions:
Estimate Std.Err z-value P(>|z|)
Creatinine ~
Age 0.153 0.058 2.611 0.009
Sodium ~
Creatinine -0.190 0.058 -3.269 0.001
Smoking ~
Gender 0.438 0.053 8.232 0.000
Ejection.Fraction ~
Sodium 0.184 0.057 3.214 0.001
Gender -0.154 0.057 -2.682 0.007
Deceased ~
Creatinine 0.259 0.053 4.875 0.000
Age 0.233 0.053 4.399 0.000
Ejection.Frctn -0.290 0.053 -5.514 0.000
Covariances:
Estimate Std.Err z-value P(>|z|)
.Smoking ~~
.Deceased 0.000
Age ~~
Gender 0.000
Variances:
Estimate Std.Err z-value P(>|z|)
.Creatinine 0.973 0.081 11.958 0.000
.Sodium 0.961 0.080 11.958 0.000
.Smoking 0.806 0.067 11.958 0.000
.Ejection.Frctn 0.937 0.078 11.958 0.000
.Deceased 0.784 0.066 11.958 0.000
Age 0.997 0.083 11.958 0.000
Gender 0.997 0.083 11.958 0.000
summary(ggm_sem2)$pe
ggm2_coef <- summary(ggm_sem2)$pe
ggm2_var <- ggm2_coef %>%
filter(lhs == rhs) %>%
select(rhs, est) %>%
rename(var = est) %>%
mutate(var = round(var, 2))
ggm_est2 <- ggm2_coef %>%
filter(op == "~") %>%
select(lhs, rhs, est, pvalue) %>%
rename(sem_est2 = est, sem_pvalue2 = pvalue) %>%
mutate(sem_est2 = round(sem_est2, 2)) %>%
left_join(ggm2_var, by = c("rhs"))
ggm_est2
Taking just a quick look at the new coefficients. Comparing them with the first GGM shows that they have not changed. Too lazy to fix the missing Smoking node.
bn4_tidy_sem <- bn2_tidy_sem
bn4_tidy_sem$data <- bn4_tidy_sem$data %>%
left_join(ggm_est2, by = c("name" = "rhs", "to" = "lhs"))
ggdag(bn4_tidy_sem %>% filter(is.na(to) | !is.na(sem_est2))) +
geom_label(aes(x_text, y_text, label = sem_est2))
We are keeping in all edges as determined from the literature, even if the GGM would have them removed.
# Scale dat to make sure coefficients have same scale
dat_scaled <- dat %>%
mutate(across(where(is.numeric), function(x) {
if (cur_column() == "TIME") return(x)
scale(x)
}))
cox_m1 <- coxph(Surv(TIME, Event) ~ Creatinine +
Age +
Smoking +
Ejection.Fraction +
BP,
data = dat_scaled)
cox_m1
Call:
coxph(formula = Surv(TIME, Event) ~ Creatinine + Age + Smoking +
Ejection.Fraction + BP, data = dat_scaled)
coef exp(coef) se(coef) z p
Creatinine 0.35894 1.43181 0.06903 5.199 2.00e-07
Age 0.52518 1.69076 0.10747 4.887 1.03e-06
SmokingTRUE -0.01046 0.98959 0.22299 -0.047 0.9626
Ejection.Fraction -0.58755 0.55569 0.11892 -4.941 7.78e-07
BPTRUE 0.47132 1.60211 0.21141 2.229 0.0258
Likelihood ratio test=71.31 on 5 df, p=5.473e-14
n= 299, number of events= 96
summ_cox <- summary(cox_m1)
summ_cox
Call:
coxph(formula = Surv(TIME, Event) ~ Creatinine + Age + Smoking +
Ejection.Fraction + BP, data = dat_scaled)
n= 299, number of events= 96
coef exp(coef) se(coef) z Pr(>|z|)
Creatinine 0.35894 1.43181 0.06903 5.199 2.00e-07 ***
Age 0.52518 1.69076 0.10747 4.887 1.03e-06 ***
SmokingTRUE -0.01046 0.98959 0.22299 -0.047 0.9626
Ejection.Fraction -0.58755 0.55569 0.11892 -4.941 7.78e-07 ***
BPTRUE 0.47132 1.60211 0.21141 2.229 0.0258 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
exp(coef) exp(-coef) lower .95 upper .95
Creatinine 1.4318 0.6984 1.2506 1.6393
Age 1.6908 0.5914 1.3696 2.0872
SmokingTRUE 0.9896 1.0105 0.6392 1.5320
Ejection.Fraction 0.5557 1.7996 0.4402 0.7015
BPTRUE 1.6021 0.6242 1.0586 2.4246
Concordance= 0.729 (se = 0.029 )
Likelihood ratio test= 71.31 on 5 df, p=5e-14
Wald test = 78.74 on 5 df, p=2e-15
Score (logrank) test = 78.05 on 5 df, p=2e-15
This shows that smoking barely has any effect on survival. Removing the edge between Smoking and Deceased would cause Smoking to no longer have any outgoing edges, so we drop it completely from the network.
cox_m1_fit <- survfit(cox_m1, data = dat_scaled)
# Plot the baseline survival function
ggsurvplot(cox_m1_fit, palette = "#2E9FDF",
ggtheme = theme_minimal())
############# Serum creatinine
crea_df <- with(dat_scaled, {
data.frame(
Age = mean(Age),
Smoking = FALSE,
Ejection.Fraction = mean(Ejection.Fraction),
BP = FALSE,
Creatinine = quantile(Creatinine, c(0.1, 0.5, 0.9))
)
})
g_crea <- ggsurvplot(survfit(cox_m1, newdata = crea_df),
ggtheme = theme_minimal(), data = crea_df) +
labs(title = "A) Survival by serum creatinine levels")
Warning: `gather_()` was deprecated in tidyr 1.2.0.
Please use `gather()` instead.
############# Ejection fraction
ejec_df <- with(dat_scaled, {
data.frame(
Age = mean(Age),
Smoking = FALSE,
Ejection.Fraction = quantile(Ejection.Fraction, c(0.1, 0.5, 0.9)),
BP = FALSE,
Creatinine = mean(Creatinine)
)
})
g_ejec <- ggsurvplot(survfit(cox_m1, newdata = ejec_df),
ggtheme = theme_minimal(), data = ejec_df) +
labs(title = "B) Survival by ejection fraction levels")
############# Age
# Roughly corresponds to 10%, 50%, 90% quantiles
age_scale <- attr(dat_scaled$Age, "scaled:scale")
age_center <- attr(dat_scaled$Age, "scaled:center")
ages = c(45, 60, 75)
age_df <- with(dat_scaled, {
data.frame(
Age = (ages - age_center) / age_scale,
Smoking = FALSE,
Ejection.Fraction = mean(Ejection.Fraction),
BP = FALSE,
Creatinine = mean(Creatinine)
)
})
cox_m1_fit_age <- survfit(cox_m1, newdata = age_df)
g_age <- ggsurvplot(cox_m1_fit_age, legend.labs = ages,
ggtheme = theme_minimal(), data = age_df) +
labs(title = "C) Survival by age")
############# BP
bp_df <- with(dat_scaled, {
data.frame(
Age = mean(Age),
Smoking = FALSE,
Ejection.Fraction = mean(Ejection.Fraction),
BP = c(TRUE, FALSE),
Creatinine = mean(Creatinine)
)
})
g_bp <- ggsurvplot(survfit(cox_m1, newdata = bp_df, legend.labs = c("TRUE", "FALSE")),
ggtheme = theme_minimal(), data = bp_df) +
labs(title = "D) Survival by high blood pressure")
g_bp$plot <- g_bp$plot +
scale_color_hue(labels = c("High", "Low")) +
scale_fill_hue(labels = c("High", "Low"))
quantile(dat$Creatinine, c(0.1, 0.5, 0.9))
10% 50% 90%
0.8 1.1 2.1
quantile(dat$Ejection.Fraction, c(0.1, 0.5, 0.9))
10% 50% 90%
25 38 60
ggarrange(plotlist = lapply(list(g_crea, g_ejec, g_age, g_bp), function (sp) {
sp$plot +
theme(
text = element_text(size = 12),
axis.text = element_text(size = 12),
axis.title = element_text(size = 14),
legend.text = element_text(size = 12)
)
}))
ggsave(file.path("images", "surv_strata.pdf"), width = 12, height = 10)
Drop Smoking and merge Cox regression estimates with second set of GGM estimates.
summ_cox$coefficients[,1]
Creatinine Age SmokingTRUE Ejection.Fraction BPTRUE
0.35893780 0.52517818 -0.01046009 -0.58755123 0.47131848
cox_coef <- summ_cox$coefficients[,1]
names(cox_coef) <- c("Creatinine", "Age", "Smoking", "Ejection.Fraction", "BP")
# Recreate tidy DAG to get a new layout
bn_final <- tidy_dagitty(bn2, layout = 'dh')
bn_final$data <- bn_final$data %>%
left_join(ggm_est2, by = c("name" = "rhs", "to" = "lhs")) %>%
mutate(
coef = case_when(
to == "Deceased" & name %in% names(cox_coef) ~ round(cox_coef[name], 2),
TRUE ~ sem_est2
),
var = case_when(
var != 1 ~ var,
TRUE ~ NA_real_
)
) %>%
relocate(coef, .before = xend) %>%
filter((name != "Smoking" & to != "Smoking" & !is.na(coef)) | is.na(to)) %>%
mutate(
name = node_labels[name],
to = node_labels[to]
)
bn_final
# A DAG with 7 nodes and 8 edges
#
ggplot(bn_final, aes(x = x, y = y, xend = xend, yend = yend)) +
geom_dag_point() +
geom_dag_edges(aes(label = coef),
angle_calc = "along",
label_dodge = unit(2.5, 'mm')) +
geom_dag_text(col = "white")
Perhaps it’s clearer if we fix the coordinates ourselves:
coords <- matrix(c(
-10, -11, # Age
-7, -5, # BP
-10, -9, # SC
-7, -8, # D
-10, -5, # EF
-13, -5, # Sex
0, 0, # Smoking [not included]
-10, -7 # So
), ncol = 2, byrow = TRUE)
rownames(coords) <- node_labels
colnames(coords) <- c("x", "y")
bn_tmp <- bn_final$data %>%
mutate(
x = coords[name, "x"],
y = coords[name, "y"],
xend = coords[replace_na(to, "Sm"), "x"],
yend = coords[replace_na(to, "Sm"), "y"]
)
ggplot(bn_tmp, aes(x = x, y = y, xend = xend, yend = yend)) +
geom_dag_point() +
geom_dag_edges(aes(label = coef),
angle_calc = "along",
label_dodge = unit(-2.75, 'mm')) +
geom_dag_text(col = "white") +
geom_label(aes(label = ifelse(is.na(var), NA, paste("sigma^2 ==", var))),
nudge_x = -0.4, nudge_y = 0.7, parse = TRUE, hjust = 1) +
xlim(-15, -5) +
ylim(-11.5, -3.5) +
theme_dag_blank()
ggsave(file.path("images", "network_coefficients.pdf"))
Saving 7.29 x 4.51 in image